Basic data visualization of beatmap info provided by the osu! API (https://github.com/ppy/osu-api/wiki). Only ranked/loved/qualified maps are used, and the graphs focus on standard mode. (A few maps are actually in the graveyard but are returned by the API query anyway because of ranked/loved irregularities, e.g. https://osu.ppy.sh/b/766190&m=2, which has one loved CtB difficulty.)
library(ggplot2)
library(gridExtra)
library(plyr)
library(jsonlite)
library(varhandle)
# Import data from JSON and remove duplicate rows.
# fromJSON() is expected to return a list of per-page tables that rbind() stacks into
# one data frame (NOTE(review): depends on the layout of maps.json - confirm).
beatmaps <- unique(do.call("rbind", fromJSON("maps.json")))
# The values arrive as strings; detect the columns whose values all look like
# integers, and the columns whose values all look numeric but are not all integers.
beatmaps.isintcol <- vapply(
  beatmaps,
  function(col) all(check.numeric(col, only.integer = TRUE)),
  logical(1)
)
beatmaps.isnumcol <- vapply(
  beatmaps,
  function(col) all(check.numeric(col)),
  logical(1)
) & !beatmaps.isintcol
# Convert column by column with list-style indexing. This avoids the intermediate
# matrix that sapply() builds and behaves the same for zero, one or many columns.
beatmaps[beatmaps.isintcol] <- lapply(beatmaps[beatmaps.isintcol], as.integer)
beatmaps[beatmaps.isnumcol] <- lapply(beatmaps[beatmaps.isnumcol], as.numeric)
# Parse the MySQL-style datetime strings into R datetimes (POSIXct). On the new site
# the times appear to be UTC-4, but that may be a local setting, so the fixed offset
# used here is a guess and is probably broken.
# Note: in "Etc/GMT+4" the sign is inverted by convention, i.e. the zone is UTC-4.
beatmaps[c("approved_date", "last_update")] <- lapply(
  beatmaps[c("approved_date", "last_update")],
  as.POSIXct,
  tz = "Etc/GMT+4"
)
# Create labels and data frames for each gamemode.
# gamemodes holds the variable name of each per-mode data frame; gamemode.labels holds
# the matching display label. Both are ordered by the API's numeric mode code (0 to 3).
gamemodes <- c("std", "taiko", "ctb", "mania")
gamemode.labels <- c("Standard", "Taiko", "CtB", "Mania")
# Map mode codes to labels explicitly. Without `levels`, factor() pairs the labels with
# the sorted unique values actually present, which errors (or mislabels) as soon as one
# mode is missing from the data. The guard keeps the line safe to re-run.
if (!is.factor(beatmaps$mode)) {
  beatmaps$mode <- factor(beatmaps$mode, levels = 0:3, labels = gamemode.labels)
}
# Define std, taiko, ctb and mania as the rows of beatmaps belonging to each mode.
# %in% never yields NA, so rows with a missing mode are simply left out.
for (i in seq_along(gamemodes)) {
  assign(gamemodes[i], beatmaps[beatmaps$mode %in% gamemode.labels[i], ])
}
# Reusable ggplot2 components shared by the plots below.
# The axis limits are deliberately tight, so a few outlier maps fall outside them.

# Star rating: 0-10 stars, tick every 0.5, histogram bins of 0.05
diff_x_scale <- scale_x_continuous(
  limits = c(0, 10),
  breaks = seq(0, 10, by = 0.5)
)
diff_hist <- geom_histogram(binwidth = 0.05)

# Length: 0-600, tick every 30, histogram bins of 1
length_x_scale <- scale_x_continuous(
  limits = c(0, 600),
  breaks = seq(0, 600, by = 30)
)
length_hist <- geom_histogram(binwidth = 1)

# y axes for approach rate (ticks only) and star rating (ticks and limits)
AR_y_scale <- scale_y_continuous(breaks = 0:10)
SR_y_scale <- scale_y_continuous(
  limits = c(0, 10),
  breaks = seq(0, 10, by = 1)
)

# Date axis with one labelled tick per year
approved_x_scale <- scale_x_datetime(date_breaks = "1 year", date_labels = "%Y")

# Legend titles for fill- and color-mapped game mode
legend_title_fill <- labs(fill = "Mode")
legend_title_color <- labs(color = "Mode")

# Center all plot titles
theme_update(plot.title = element_text(hjust = 0.5))
# Stacked histogram of star rating, filled by game mode
ggplot(beatmaps, aes(x = difficultyrating, fill = as.factor(mode))) +
  diff_hist +
  diff_x_scale +
  legend_title_fill +
  ggtitle("Total Star Rating (All Modes)")
## Warning: Removed 61 rows containing non-finite values (stat_bin).
# Star rating frequency polygons, one line per game mode
ggplot(beatmaps, aes(x = difficultyrating, color = as.factor(mode))) +
  geom_freqpoly(binwidth = 0.05) +
  diff_x_scale +
  legend_title_color +
  ggtitle("Star Rating (All Modes)")
## Warning: Removed 61 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).
# Star rating histograms, one facet per game mode
ggplot(beatmaps, aes(x = difficultyrating)) +
  diff_hist +
  diff_x_scale +
  facet_wrap("mode") +
  labs(title = "Star Rating Distributions (All Modes)")
## Warning: Removed 61 rows containing non-finite values (stat_bin).
# Star rating histograms per game mode, drawn as four independent plots so that
# each one gets its own y scale, then arranged on a single page
diffplots0 <- ggplot(std, aes(x = difficultyrating)) +
  diff_hist + diff_x_scale + ggtitle("Standard")
diffplots1 <- ggplot(taiko, aes(x = difficultyrating)) +
  diff_hist + diff_x_scale + ggtitle("Taiko")
diffplots2 <- ggplot(ctb, aes(x = difficultyrating)) +
  diff_hist + diff_x_scale + ggtitle("CtB")
diffplots3 <- ggplot(mania, aes(x = difficultyrating)) +
  diff_hist + diff_x_scale + ggtitle("Mania")
grid.arrange(
  diffplots0, diffplots1, diffplots2, diffplots3,
  top = "Star Rating Distributions (All Modes)"
)
## Warning: Removed 9 rows containing non-finite values (stat_bin).
## Warning: Removed 15 rows containing non-finite values (stat_bin).
## Warning: Removed 24 rows containing non-finite values (stat_bin).
## Warning: Removed 13 rows containing non-finite values (stat_bin).
# Stacked histogram of total beatmap length, filled by game mode
ggplot(beatmaps, aes(x = total_length, fill = as.factor(mode))) +
  length_hist +
  length_x_scale +
  legend_title_fill +
  ggtitle("Total Beatmap Length (All Modes)")
## Warning: Removed 117 rows containing non-finite values (stat_bin).
# Total length frequency polygons, one line per game mode
ggplot(beatmaps, aes(x = total_length, color = as.factor(mode))) +
  geom_freqpoly(binwidth = 1) +
  length_x_scale +
  legend_title_color +
  ggtitle("Beatmap Length (All Modes)")
## Warning: Removed 117 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).
# Total length histograms per game mode, drawn as four independent plots and
# arranged on a single page
lengthplots0 <- ggplot(std, aes(x = total_length)) +
  length_hist + length_x_scale + ggtitle("Standard")
lengthplots1 <- ggplot(taiko, aes(x = total_length)) +
  length_hist + length_x_scale + ggtitle("Taiko")
lengthplots2 <- ggplot(ctb, aes(x = total_length)) +
  length_hist + length_x_scale + ggtitle("CtB")
lengthplots3 <- ggplot(mania, aes(x = total_length)) +
  length_hist + length_x_scale + ggtitle("Mania")
grid.arrange(
  lengthplots0, lengthplots1, lengthplots2, lengthplots3,
  top = "Beatmap Length Distributions (All Modes)"
)
## Warning: Removed 88 rows containing non-finite values (stat_bin).
## Warning: Removed 16 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing non-finite values (stat_bin).
# Playcount frequency polygons, one line per game mode.
# The x axis stops at one million plays, so maps above that are dropped from the plot.
ggplot(beatmaps, aes(x = playcount, color = as.factor(mode))) +
  geom_freqpoly(binwidth = 5000) +
  scale_x_continuous(limits = c(0, 1e6)) +
  legend_title_color +
  ggtitle("Playcount (All Modes)")
## Warning: Removed 1019 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_path).
# Stacked histogram of approval date, filled by game mode.
# The datetime axis is in seconds, so the bin width below is 30 days.
ggplot(beatmaps, aes(x = approved_date, fill = as.factor(mode))) +
  geom_histogram(binwidth = 30 * 24 * 3600) +
  approved_x_scale +
  legend_title_fill +
  ggtitle("Total Date Approved (All Modes)")
# Approval date frequency polygons, one line per game mode (same 30-day bins)
ggplot(beatmaps, aes(x = approved_date, color = as.factor(mode))) +
  geom_freqpoly(binwidth = 30 * 24 * 3600) +
  approved_x_scale +
  legend_title_color +
  ggtitle("Date Approved (All Modes)")
# Most frequent artists, titles, sources, and creators (top 10 of each, by row count)
# Ten most common artists
head(sort(table(beatmaps[["artist"]]), decreasing = TRUE), n = 10)
##
## Hatsune Miku ClariS KOTOKO fripSide
## 653 421 398 362
## xi senya IOSYS Various Artists
## 349 339 331 329
## LiSA yanaginagi
## 323 311
# Ten most common song titles
head(sort(table(beatmaps[["title"]]), decreasing = TRUE), n = 10)
##
## Piano 7K BMS Pack
## 193
## Piano Beatmap Set
## 114
## Harumachi Clover
## 70
## Granat
## 65
## Ai no Scenario
## 62
## PEPPY FIX TAIKO STAR RATING PLEASE for a happier
## 61
## Tokyo (Innovaderz Remix)
## 61
## MIIRO
## 58
## Hitorigoto -TV MIX-
## 55
## Re:TrymenT
## 54
# Ten most common sources (the empty string counts as a value of its own)
head(sort(table(beatmaps[["source"]]), decreasing = TRUE), n = 10)
##
## Touhou
## 24652 2370
## BMS 東方Project
## 1557 754
## SOUND VOLTEX III GRAVITY WARS DJMAX
## 466 455
## beatmania IIDX SOUND VOLTEX II -infinite infection-
## 398 390
## osu! Vocaloid
## 258 242
# Ten most common creators
head(sort(table(beatmaps[["creator"]]), decreasing = TRUE), n = 10)
##
## osuplayer111 Sotarks DJPop tutuhaha ztrot
## 572 571 563 387 377
## Larto Natsu Monstrata Ascendance pishifat
## 354 345 331 311 311
# Most favorited mapsets. A mapset is identified by its (creator, artist, title)
# triple; only the first row of each triple is kept.
mapsets <- beatmaps[!duplicated(beatmaps[, c("creator", "artist", "title")]), ]
# Top 20 by favourite count, highest first
most_fav <- head(arrange(mapsets, desc(favourite_count)), n = 20)
most_fav[, c("favourite_count", "creator", "title")]
## favourite_count creator title
## 1 13163 W h i t e My Love
## 2 10292 Fort Highscore
## 3 9763 jonathanlfj Tear Rain
## 4 8282 Charles445 Liquid (Paul Rosenthal Remix)
## 5 7604 VINXIS No title
## 6 6886 Ekoro Everything will freeze
## 7 6880 Kuria Guren no Yumiya (TV Size)
## 8 6606 Doormat Hitorigoto -TV MIX-
## 9 6534 Awaken Toumei Elegy
## 10 6335 Voltaeyx Mayday (feat. Laura Brehm)
## 11 5510 Takuya Pika Girl
## 12 5425 Bearizm Cold Green Eyes ft. Roos Denayer
## 13 5300 Saten-san Kokou no Sousei
## 14 4878 gowww MATRYOSHKA
## 15 4849 h3k1ru River Flows In You (A Love Note)
## 16 4829 ouranhshc Bad Apple!!
## 17 4829 -kevincela- Flaklypa
## 18 4770 ktgster Lost
## 19 4618 Sekai-nyan This game (TV Size)
## 20 4591 Kuria Answer is Near
# Most played maps: the 20 rows with the highest playcount, highest first
most_played <- head(arrange(beatmaps, desc(playcount)), n = 20)
most_played[, c("playcount", "creator", "title", "version")]
## playcount creator title
## 1 21429513 W h i t e My Love
## 2 20405157 W h i t e My Love
## 3 18188427 jonathanlfj Tear Rain
## 4 14209863 Blue Dragon The Big Black
## 5 12840626 jonathanlfj Tear Rain
## 6 12456963 ktgster Lost
## 7 12124613 -kevincela- Flaklypa
## 8 10881187 Charles445 Liquid (Paul Rosenthal Remix)
## 9 10878775 Blue Dragon Can't Defeat Airman
## 10 10611576 -kevincela- Flaklypa
## 11 9287944 W h i t e My Love
## 12 8065171 Multiple Creators Renatus
## 13 8028482 jonathanlfj Tear Rain
## 14 7715600 ktgster Lost
## 15 7548246 Rue Dear You
## 16 7343941 Bearizm Cold Green Eyes ft. Roos Denayer
## 17 7328010 h3k1ru River Flows In You (A Love Note)
## 18 6820213 VINXIS No title
## 19 6818829 Charles445 Liquid (Paul Rosenthal Remix)
## 20 6684374 val0108 Scarlet Rose
## version
## 1 Hard
## 2 Normal
## 3 Normal
## 4 WHO'S AFRAID OF THE BIG BLACK
## 5 Hard
## 6 Normal
## 7 Normal
## 8 Easy
## 9 Holy Shit! It's Airman!!
## 10 Hard
## 11 Insane
## 12 Normal
## 13 Insane
## 14 Hard
## 15 Dear Rue
## 16 Divine
## 17 Love Note
## 18 Light Insane
## 19 Normal
## 20 0108 style
# Standard mode: approach rate against BPM (x axis limited to 0-500 BPM)
ggplot(std, aes(x = bpm, y = diff_approach)) +
  geom_point(alpha = 0.1) +
  scale_x_continuous(limits = c(0, 500)) +
  AR_y_scale +
  ggtitle("Approach Rate vs BPM")
## Warning: Removed 124 rows containing missing values (geom_point).
# Standard mode: star rating against total length
ggplot(std, aes(x = total_length, y = difficultyrating)) +
  geom_point(alpha = 0.1) +
  length_x_scale +
  SR_y_scale +
  ggtitle("Star Rating vs Total Length")
## Warning: Removed 97 rows containing missing values (geom_point).
# Standard mode: max combo against drain time (y axis limited to 0-4000)
ggplot(std, aes(x = hit_length, y = max_combo)) +
  geom_point(alpha = 0.05) +
  length_x_scale +
  scale_y_continuous(limits = c(0, 4000)) +
  ggtitle("Max Combo vs Drain Time")
## Warning: Removed 73 rows containing missing values (geom_point).
# Linear fit of max combo on drain time for standard maps; a high linear
# correlation is expected here
summary(
  lm(formula = max_combo ~ hit_length, data = std)
)
##
## Call:
## lm(formula = max_combo ~ hit_length, data = std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3860.7 -134.3 -17.4 120.5 23020.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -99.20894 2.51272 -39.48 <2e-16 ***
## hit_length 4.89613 0.01715 285.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 272.8 on 56923 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.5888, Adjusted R-squared: 0.5888
## F-statistic: 8.152e+04 on 1 and 56923 DF, p-value: < 2.2e-16
# Standard mode: favorite count against playcount
# (limited to one million plays and 1000 favorites)
ggplot(std, aes(x = playcount, y = favourite_count)) +
  geom_point(alpha = 0.05) +
  scale_x_continuous(limits = c(0, 1e6)) +
  scale_y_continuous(limits = c(0, 1000)) +
  ggtitle("Favorite Count vs Playcount")
## Warning: Removed 2579 rows containing missing values (geom_point).
# Standard mode: playcount against total length (limited to one million plays)
ggplot(std, aes(x = total_length, y = playcount)) +
  geom_point(alpha = 0.1) +
  length_x_scale +
  scale_y_continuous(limits = c(0, 1e6)) +
  ggtitle("Playcount vs Total Length")
## Warning: Removed 1099 rows containing missing values (geom_point).
# Standard mode: approach rate against approval date
ggplot(std, aes(x = approved_date, y = diff_approach)) +
  geom_point(alpha = 0.05) +
  approved_x_scale +
  AR_y_scale +
  ggtitle("Approach Rate vs Date Approved")
# Standard mode: star rating against approval date
ggplot(std, aes(x = approved_date, y = difficultyrating)) +
  geom_point(alpha = 0.1) +
  approved_x_scale +
  SR_y_scale +
  ggtitle("Star Rating vs Date Approved")
## Warning: Removed 9 rows containing missing values (geom_point).
# Playcount by song time, categorized by spread icon
# https://osu.ppy.sh/help/wiki/Difficulties#star-rating Not sure about values between boundaries
# spread.sr holds the lower star-rating bound of each spread icon, in ascending
# order; spread.names and spread.colors are parallel to it.
spread.sr <- c(0, 1.51, 2.26, 3.76, 5.26, 6.76)
spread.names <- c("Easy", "Normal", "Hard", "Insane", "Expert", "Expert+")
spread.colors <- c("olivedrab3", "paleturquoise", "gold", "hotpink", "purple", "darkgray")
# Assign difficulty rating by spread ranges to spread names.
# cut() produces one interval per adjacent pair of breaks, so six names need seven
# breaks: Inf closes the open-ended "Expert+" range. Without it every map rated
# 6.76 stars or more got NA and the "Expert+" charts were always empty.
# right = FALSE makes each range include its lower bound: [lower, upper).
beatmaps$spread_name <- spread.names[
  cut(beatmaps$difficultyrating, c(spread.sr, Inf), right = FALSE, labels = FALSE)
]
std <- beatmaps[beatmaps$mode == "Standard", ] # update std

# Total playcount of the rows of `maps` per hit-length bin.
#   maps:   data frame with `hit_length` and `playcount` columns
#   breaks: ascending bin edges; bins are (lower, upper], as cut() does by default
# Returns one total per bin, 0 for empty bins. Rows whose hit_length lies outside the
# breaks (or is NA) are ignored. Playcounts are summed as doubles because an integer
# sum turns into NA once it exceeds .Machine$integer.max.
sum_playcount_by_hitlength <- function(maps, breaks) {
  bins <- cut(maps$hit_length, breaks)
  vapply(split(as.numeric(maps$playcount), bins), sum, numeric(1))
}

# One bar chart per spread icon: total playcount in 30-second hit-length bins
hitlength.bins <- seq(0, 360, 30)
# Switch to a 2x3 panel layout, remembering the previous settings so they can be
# restored afterwards without closing the graphics device.
old.par <- par(mfrow = c(2, 3), mar = c(4, 4, 4, 1), cex.main = 2)
for (i in seq_along(spread.names)) {
  # which() drops rows whose spread_name is NA instead of turning them into NA rows
  std.spread <- std[which(std$spread_name == spread.names[i]), ]
  playcount.bin.sum <- sum_playcount_by_hitlength(std.spread, hitlength.bins)
  barplot(playcount.bin.sum, space = 0, width = 30, xlab = "Hit length (s)",
          ylab = "Playcount Total", main = spread.names[i],
          col = spread.colors[i], axisnames = FALSE)
  # Bars are 30 wide with no gaps, so the bin edges are the tick positions
  axis(1, at = hitlength.bins)
}
par(old.par) # Reset par

# Same but 150+ hitlength and stacked bars
hitlength.bins.150 <- seq(150, 360, 30)
# Rows are spread icons, columns are hit-length bins named by their lower edge
playcount.bin.mat <- matrix(
  0,
  nrow = length(spread.names),
  ncol = length(hitlength.bins.150) - 1,
  dimnames = list(spread.names, head(hitlength.bins.150, -1))
)
for (i in seq_len(nrow(playcount.bin.mat))) {
  std.spread <- std[which(std$spread_name == spread.names[i]), ]
  playcount.bin.mat[i, ] <- sum_playcount_by_hitlength(std.spread, hitlength.bins.150)
}
barplot(playcount.bin.mat, space = 0, width = 30, col = spread.colors,
        xlab = "Hitlength (s)", ylab = "Total Playcount",
        legend.text = spread.names, axisnames = FALSE,
        main = "Total Playcount by Hitlength and Difficulty")
# The bars start at x = 0, so shift the tick positions and label them with the real bin edges
axis(1, at = hitlength.bins.150 - hitlength.bins.150[1], labels = hitlength.bins.150)